Setup

library(tidyverse)
library(magrittr)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(cqn)
library(DT)
library(Gviz)
library(AnnotationFilter)
library(Rsamtools)
if (interactive()) setwd(here::here())
theme_set(theme_bw())

Sample information

files1 <- list.files(
  path = "0_rawData/FastQC",
  pattern = "zip",
  full.names = TRUE
)
files2 <- list.files(
  path = "../20170327_Psen2S4Ter_RNASeq/data/0_rawData/FastQC",
  pattern = "zip",
  full.names = TRUE
) %>%
  str_subset("R1_fastqc.zip")
filesFull <- c(files1, files2)
files <- filesFull %>%
  basename()
samples <- tibble(
  sample = str_remove(files, "_fastqc.zip"),
  dataset = NA,
  organism = NA
) %>%
  mutate(
    dataset = ifelse(
      str_detect(sample, "SRR213"), "E-GEOD-71609", dataset
    ),
    dataset = ifelse(
      str_detect(sample, "SRR218"), "E-GEOD-72322", dataset
    ),
    dataset = ifelse(
      str_detect(sample, "Ps2Ex"), "Psen2S4Ter", dataset
    ),
    sample = ifelse(
      str_detect(sample, "Ps2Ex"), str_remove(sample, "_R1"), sample
    ),
    organism = ifelse(
      str_detect(sample, "(SRR213|SRR218|Ps2Ex)"), "zebrafish", organism
    ),
    dataset = ifelse(
      str_detect(sample, "ERR313"), "E-MTAB-7636", dataset
    ),
    dataset = ifelse(
      str_detect(sample, "ERR268"), "E-MTAB-6972", dataset
    ),
    organism = ifelse(
      str_detect(sample, "(ERR313|ERR268)"), "mouse", organism
    )
  )
datasets <- samples$dataset %>% 
  unique()

Sequence information

ah_Dr <- AnnotationHub() %>%
  subset(species == "Danio rerio") %>%
  subset(rdataclass == "EnsDb")
ensDb_Dr <- ah_Dr[["AH74989"]]
trEns_Dr <- transcripts(ensDb_Dr) %>%
  mcols() %>% 
  as_tibble()
trLen_Dr <- exonsBy(ensDb_Dr, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
geneGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Dr) %>%
  group_by(gene_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
trGcLen_Dr <- trLen_Dr %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Dr) %>%
  group_by(tx_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
genesGR_Dr <- genes(ensDb_Dr)
mcols(genesGR_Dr) <- mcols(genesGR_Dr)[c("gene_id", "gene_name", 
                                         "gene_biotype", "entrezid")]
txGR_Dr <- transcripts(ensDb_Dr)
mcols(txGR_Dr) <- mcols(txGR_Dr)[c("tx_id", "tx_name", 
                                   "tx_biotype", "tx_id_version", "gene_id")]
ah_Mm <- AnnotationHub() %>%
  subset(species == "Mus musculus") %>%
  subset(rdataclass == "EnsDb")
ensDb_Mm <- ah_Mm[["AH75036"]]
trEns_Mm <- transcripts(ensDb_Mm) %>%
  mcols() %>% 
  as_tibble()
trLen_Mm <- exonsBy(ensDb_Mm, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
geneGcLen_Mm <- trLen_Mm %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Mm) %>%
  group_by(gene_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
trGcLen_Mm <- trLen_Mm %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Mm) %>%
  group_by(tx_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
genesGR_Mm <- genes(ensDb_Mm)
mcols(genesGR_Mm) <- mcols(genesGR_Mm)[c("gene_id", "gene_name", 
                                         "gene_biotype", "entrezid")]
ah_Hs <- AnnotationHub() %>%
  subset(species == "Homo sapiens") %>%
  subset(rdataclass == "EnsDb")
ensDb_Hs <- ah_Hs[["AH75011"]]
trEns_Hs <- transcripts(ensDb_Hs) %>%
  mcols() %>% 
  as_tibble()
trLen_Hs <- exonsBy(ensDb_Hs, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
geneGcLen_Hs <- trLen_Hs %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Hs) %>%
  group_by(gene_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
trGcLen_Hs <- trLen_Hs %>%
  enframe() %>%
  set_colnames(c("tx_id", "length")) %>%
  left_join(trEns_Hs) %>%
  group_by(tx_id) %>% 
  summarise(
    aveLen = mean(length),
    maxLen = max(length), 
    aveGc = sum(length * gc_content) / sum(length),
    longestGc = gc_content[which.max(length)[[1]]]
  ) %>%
  mutate(
    aveGc =  aveGc / 100,
    longestGc = longestGc / 100
  )
genesGR_Hs <- genes(ensDb_Hs)
mcols(genesGR_Hs) <- mcols(genesGR_Hs)[c("gene_id", "gene_name", 
                                         "gene_biotype", "entrezid")]

An EnsDb object was obtained for Ensembl release 98 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release.

  • For zebrafish, this consisted of 37241 genes and 65905 transcripts.
  • For mouse, 56239 genes and 144524 transcripts.
  • For human, 67946 genes and 250194 transcripts.

Raw data

Raw data was sourced from publically available datasets on ArrayExpress. Samples from zebrafish (Danio rerio) were chosen for an initial inspection as it is expected that rRNA sequences are more divergent from commonly chosen model organisms such as mice. Hence, inefficient ribosomal RNA (rRNA) depletion by kits that are generally optimised for these common model organisms may produce a stronger signal in zebrafish.

Following an initial inspection, further analysis on human and mouse datasets was performed.

The following analysis involves 92 samples across 5 datasets: E-MTAB-6972, E-MTAB-7636, E-GEOD-71609, E-GEOD-72322, Psen2S4Ter.

Library sizes

rawFqc <- filesFull %>%
  FastqcDataList()
data1 <- grepl("SRR213", fqName(rawFqc))
rawLib1 <- plotReadTotals(rawFqc[data1]) +
  labs(subtitle = "E-GEOD-71609")
data2 <- grepl("SRR218", fqName(rawFqc))
rawLib2 <- plotReadTotals(rawFqc[data2]) +
  labs(subtitle = "E-GEOD-72322")
data3 <- grepl("Ps2Ex", fqName(rawFqc))
labels3 <- rawFqc[data3] %>%
  fqName() %>%
  str_remove("_6month_07_07_2016_F3") %>%
  str_remove("\\.fastq\\.gz") %>%
  str_remove("Ps2Ex3M1_")
rawLib3 <- plotReadTotals(rawFqc[data3]) +
  labs(subtitle = "Psen2S4Ter") + 
  scale_x_discrete(labels = labels3)
data4 <- grepl("ERR313", fqName(rawFqc))
rawLib4 <- plotReadTotals(rawFqc[data4]) +
  labs(subtitle = "E-MTAB-7636") 
data5 <- grepl("ERR268", fqName(rawFqc))
rawLib5 <- plotReadTotals(rawFqc[data5]) +
  labs(subtitle = "E-MTAB-6972")

The library sizes of unprocessed datasets ranged between 13,269,891 and 100,459,213 reads.

ggarrange(rawLib1, rawLib2, rawLib3, rawLib4, rawLib5, ncol = 1, nrow = 5)

GC content

rRNA transcripts are known to have high GC content. Therefore, surveying the GC content of the raw reads serves as a good starting point for detecting incomplete rRNA removal. A spike in GC content at > 70% is expected if this is the case.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data1], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    labs(title = "Dataset 1, E-GEOD-71609 (D. rerio)") + 
    theme(legend.position="none")
)

GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data2], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    labs(title = "Dataset 2, E-GEOD-72322 (D. rerio)") + 
    theme(legend.position="none")
)

GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data3], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Drerio"
  ) +
    labs(title = "Dataset 3, Psen2S4Ter (D. rerio)") + 
    theme(legend.position="none")
) 

GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data4], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Mmusculus"
  ) +
    labs(title = "Dataset 4, E-MTAB-7636 (M. musculus)") + 
    theme(legend.position="none")
)

GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.

plotly::ggplotly(
  plotGcContent(
    x = rawFqc[data5], 
    plotType = "line",
    gcType = "Transcriptome",
    species = "Mmusculus"
  ) +
    labs(title = "Dataset 5, E-MTAB-6972 (M. musuculus)") + 
    theme(legend.position="none")
)

GC content of reads in the dataset. No obvious spikes above 70% GC are observed, however unusual distibution shapes in some samples are worth further exploration.

Overrepresented sequences

The top 30 overrepresented sequences were analysed using blastn and were found to be predominantly rRNA sequences.

getModule(rawFqc, "Overrep") %>% 
  group_by(Sequence, Possible_Source) %>% 
  summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>% 
  arrange(desc(`Highest Percentage`), desc(`Found In`)) %>% 
  ungroup() %>% 
  dplyr::slice(1:30) %>%
  mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
  kable(
    align = "llrr", 
    caption = paste(
      "Top", nrow(.),"Overrepresented sequences.",
      "The number of samples they were found in is shown,",
      "along with the percentage of the most 'contaminated' sample."
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
Top 30 Overrepresented sequences. The number of samples they were found in is shown, along with the percentage of the most ‘contaminated’ sample.
Sequence Possible_Source Found In Highest Percentage
GTGGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGC No Hit 12 1.72%
CTCAGACAGGCGTAGCCCCGGGAGGAACCCGGGGCCGCAAGTGCGTTCGAA No Hit 13 1.43%
CCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATCC No Hit 24 1.32%
GGCCCGGCGCACGTCCAGAGTCGCCGCCGCACACCGCAGCGCATCCCCCC No Hit 9 1.31%
CTCGGCTCGTGCGTCGATGAAGAACGCAGCTAGCTGCGAGAATTAATGTGA No Hit 11 1.17%
CTCCTGAAAAGGTTGTATCCTTTGTTAAAGGGGCTGTACCCTCTTTAACT No Hit 11 1.11%
GGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATC No Hit 12 1.09%
GGGGTGTACGAAGCTGAACTTTTATTCATCTCCCAGACAACCAGCTATTG No Hit 12 1.07%
GGCCCGGCGCACGTCCAGAGTCGCCGCCGCGCACCGCAGCGCATCCCCCC No Hit 10 1.03%
CCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCAG No Hit 25 0.85%
CCTCCTTCAAGTATTGTTTCATGTTACATTTTCGTATATTCTGGGGTAGA No Hit 12 0.82%
GGGAGTTTGACTGGGGCGGTACACCTGTCAAACGGTAACGCAGGTGTCCT No Hit 22 0.82%
CCCGCTGTATTACTCAGGCTGCACTGCAGTGTCTATTCACAGGCGCGATC No Hit 12 0.79%
GTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCATCT No Hit 12 0.79%
GGGAGATACCATGATCACGAAGGTGGTTTTCCCAGGGCGAGGCTTATCCAT No Hit 8 0.78%
GGGTTCAGGTAATTAATTTAAAGCTACTTTCGTGTTTGGGCCTCTAGCAT No Hit 12 0.78%
CGGGTCGGGTGGGTGGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG No Hit 12 0.73%
AGCAAATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGCTTCCA No Hit 10 0.73%
CACAAATTATGCAGTCGAGTTTCCCGCATTTGGGGAAATCGCAGGGGTCAG No Hit 7 0.69%
GGGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTT No Hit 11 0.68%
CAAATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGCTTCCATG No Hit 7 0.66%
CGACGCTCAGACAGGCGTAGCCCCGGGAGGAACCCGGGGCCGCAAGTGCGT No Hit 8 0.66%
CGCACGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGACAGGAGGATCGCTT No Hit 25 0.58%
CTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCAGCACGG No Hit 23 0.58%
CGGGTCGGGTGGGTAGCCGGCATCACCGCGGACCTCGGGCGCCCTTTTGG No Hit 12 0.57%
CTTAGACGACCTGGTAGTCCAAGGCTCCCCCAGGAGCACCATATCGATAC No Hit 19 0.54%
GTTGGATTGTTCACCCACTAATAGGGAACGTGAGCTGGGTTTAGACCGTC No Hit 10 0.53%
CTCGTTTGGTTTCGGGGTTTCTAGCTGTAATTCTTTTAGTTAGAAGTTTTC No Hit 16 0.53%
AGCTGGGGAGATCCGCGAGAAGGGCCCGGCGCACGTCCAGAGTCGCCGCC No Hit 11 0.53%
GGCCTCTAGCATCTAAAAGCGTATAACAGTTAAAGGGCCGTTTGGCTTTA No Hit 10 0.53%

Trimming

trimFqc1 <- list.files(
  path = "1_trimmedData/FastQC", 
  pattern = "zip", 
  full.names = TRUE
) 
trimFqc2 <- list.files(
  path = "../20170327_Psen2S4Ter_RNASeq/data/1_trimmedData/FastQC",
  pattern = "zip",
  full.names = TRUE
)
trimFqc <- c(trimFqc1, trimFqc2) %>%
  FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
  dplyr::rename(Raw = Total_Sequences) %>%
  left_join(readTotals(trimFqc), by = "Filename") %>%
  dplyr::rename(Trimmed = Total_Sequences) %>%
  mutate(
    Discarded = 1 - Trimmed/Raw,
    Retained = Trimmed / Raw
  )

After trimming of adapters between 0.00% and 12.91% of reads were discarded.

Aligned data

Trimmed reads were firstly aligned to rRNA sequences using the BWA-MEM algorithm to calculate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.

rRNA proportions

prop1 <- samples %>%
  dplyr::filter(dataset == "E-GEOD-71609") %>%
  # cbind(tibble(proportion = c(0.0344, 0.0339, 0.0409, 0.0347, 0.0362, 0.0397, 0.0288, 0.0346, 0.0429, 0.0297))) 
  cbind(tibble(proportion = c(0.0363, 0.0367, 0.0451, 0.0367, 0.0388, 0.0427, 0.0307, 0.0369, 0.0468, 0.0312)))
prop2 <- samples %>%
  dplyr::filter(dataset == "E-GEOD-72322") %>%
  # cbind(tibble(proportion = c(0.1465, 0.1048, 0.1304, 0.1498, 0.1136, 0.1274, 0.1206, 0.1318, 0.1351, 0.1385, 0.1456, 0.1353, 0.1634, 0.1480, 0.1492, 0.1361, 0.1683, 0.1709, 0.1639, 0.1654, 0.1940, 0.1782, 0.1837, 0.1666)))
  cbind(tibble(proportion = c(0.1673, 0.1060, 0.1410, 0.1758, 0.1154, 0.1292, 0.1213, 0.1361, 0.1368, 0.1449, 0.1510, 0.1370, 0.1745, 0.1580, 0.1562, 0.1441, 0.1743, 0.1749, 0.1715, 0.1713, 0.2110, 0.1887, 0.1895, 0.1739)))
prop3 <- samples %>%
  dplyr::filter(dataset == "Psen2S4Ter") %>%
  cbind(tibble(proportion = c(0.2291, 0.2504, 0.2615, 0.2260, 0.2215, 0.2128, 0.2468, 0.2284, 0.1366, 0.1188, 0.1892, 0.1608))) %>%
  mutate(
    sample = str_remove(sample, "_6month_07_07_2016_F3"),
    sample = str_remove(sample, "Ps2Ex3M1_")
  )
prop4 <- samples %>%
  dplyr::filter(dataset == "E-MTAB-7636") %>%
  cbind(tibble(proportion = c(0.0614, 0.0517, 0.0458, 0.0633, 0.0547, 0.0564, 0.0525, 0.0520, 0.0551, 0.0513, 0.0473, 0.0448, 0.0407, 0.0238, 0.0499, 0.0307, 0.0383, 0.0178, 0.0232, 0.0384, 0.0461, 0.0545, 0.0389, 0.0353, 0.0866, 0.0763)))
prop5 <- samples %>%
  dplyr::filter(dataset == "E-MTAB-6972") %>%
  cbind(tibble(proportion = c(0.0123, 0.0122, 0.0121, 0.0121, 0.0121, 0.0131, 0.0131, 0.0131, 0.0130, 0.0131, 0.0929, 0.0924, 0.0931, 0.0929, 0.0927, 0.0224, 0.0223, 0.0222, 0.0223, 0.0222)))
rRnaProp <- rbind(prop1, prop2, prop3, prop4, prop5) 
rRnaProp$dataset %<>%
  factor(levels = c("E-GEOD-71609", "E-GEOD-72322", "Psen2S4Ter", "E-MTAB-7636", "E-MTAB-6972"))
rRnaProp %>%
  ggplot(aes(sample, proportion)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~dataset, scales = "free_x") +
  scale_y_continuous(labels = percent) +
  labs(x = "Sample", y = "Percent of Total") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
*Percentages of each library that align to rRNA sequences with `bwa mem`.*

Percentages of each library that align to rRNA sequences with bwa mem.

Following alignment to rRNA sequences, reads that mapped to rRNA sequences were removed and a separate fastq file was created.

These sequences were aligned to the Danio rerio GRCz11 genome (Ensembl release 98) using STAR 2.7.0d and summarised with featureCounts.

Gene GC content and length

dgeList_1 <- read_tsv("3_alignedDataStar/featureCounts/genes_Dr.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  dplyr::select(str_subset(colnames(.), "SRR213")) %>%
  DGEList() %>%
  calcNormFactors()
dgeList_2 <- read_tsv("3_alignedDataStar/featureCounts/genes_Dr.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  dplyr::select(str_subset(colnames(.), "SRR218")) %>%
  DGEList() %>%
  calcNormFactors()
dgeList_3 <- read_tsv("../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
  set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  DGEList() %>%
  calcNormFactors()
dgeList_4 <- read_tsv("3_alignedDataStar/featureCounts/genes_Mm.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  dplyr::select(str_subset(colnames(.), "ERR313")) %>%
  DGEList() %>%
  calcNormFactors()
dgeList_5 <- read_tsv("3_alignedDataStar/featureCounts/genes_Mm.out") %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  dplyr::select(str_subset(colnames(.), "ERR268")) %>%
  DGEList() %>%
  calcNormFactors()
dgeList_1$genes <- genesGR_Dr[rownames(dgeList_1),]
mcols(dgeList_1$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Dr) 
dgeList_1$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(rRnaProp) %>%
  column_to_rownames("rowname")
dgeList_1$samples$filenames <- read_tsv(
  "3_alignedDataStar/featureCounts/genes_Dr.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "SRR213")) %>%
  colnames() 

dgeList_2$genes <- genesGR_Dr[rownames(dgeList_2),]
mcols(dgeList_2$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Dr)
dgeList_2$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(rRnaProp) %>%
  column_to_rownames("rowname")
dgeList_2$samples$filenames <- read_tsv(
  "3_alignedDataStar/featureCounts/genes_Dr.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "SRR218")) %>%
  colnames()

dgeList_3$genes <- genesGR_Dr[rownames(dgeList_3),]
mcols(dgeList_3$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Dr)
dgeList_3$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(rRnaProp) %>%
  column_to_rownames("rowname")
## file paths need to be changed to current filepath 
## the original filepath (that featureCounts used) is different to the current
dgeList_3$samples$filenames <- read_tsv(
  "../20170327_Psen2S4Ter_RNASeq/data/2_alignedData/featureCounts/genes.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "Ps2Ex3M1_")) %>%
  colnames() %>%
  str_replace(., "/2_al", "/data/2_al")

dgeList_4$genes <- genesGR_Mm[rownames(dgeList_4),]
mcols(dgeList_4$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Mm)
dgeList_4$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(rRnaProp) %>%
  column_to_rownames("rowname")
dgeList_4$samples$filenames <- read_tsv(
  "3_alignedDataStar/featureCounts/genes_Mm.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "ERR313")) %>%
  colnames()

dgeList_5$genes <- genesGR_Mm[rownames(dgeList_5),]
mcols(dgeList_5$genes) %<>% 
  as.data.frame() %>% 
  left_join(geneGcLen_Mm)
dgeList_5$samples %<>%
  rownames_to_column("rowname") %>%
  mutate(sample = rowname) %>%
  left_join(rRnaProp) %>%
  column_to_rownames("rowname")
dgeList_5$samples$filenames <- read_tsv(
  "3_alignedDataStar/featureCounts/genes_Mm.out"
) %>% 
  dplyr::select(str_subset(colnames(.), "ERR268")) %>%
  colnames()
gcInfo_Dr <- function(x) {
  x$counts %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    as_tibble() %>%
    pivot_longer(
      cols = colnames(x), 
      names_to = "sample", 
      values_to = "counts"
    ) %>%
    dplyr::filter(
      counts > 0
    ) %>%
    left_join(
      geneGcLen_Dr
    ) %>%
    dplyr::select(
      ends_with("id"), sample, counts, aveGc, maxLen
    ) %>%
    split(f = .$sample) %>%
    lapply(
      function(x){
        DataFrame(
          gc = Rle(x$aveGc, x$counts),
          logLen = Rle(log10(x$maxLen), x$counts)
        )
      }
    ) 
}
gcInfo_Mm <- function(x) {
  x$counts %>%
    as.data.frame() %>%
    rownames_to_column("gene_id") %>%
    as_tibble() %>%
    pivot_longer(
      cols = colnames(x), 
      names_to = "sample", 
      values_to = "counts"
    ) %>%
    dplyr::filter(
      counts > 0
    ) %>%
    left_join(
      geneGcLen_Mm
    ) %>%
    dplyr::select(
      ends_with("id"), sample, counts, aveGc, maxLen
    ) %>%
    split(f = .$sample) %>%
    lapply(
      function(x){
        DataFrame(
          gc = Rle(x$aveGc, x$counts),
          logLen = Rle(log10(x$maxLen), x$counts)
        )
      }
    ) 
}
gcSummary <- function(x) {
  x %>%
    vapply(function(x){
      c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
    }, numeric(4)
    ) %>%
    t() %>%
    set_colnames(
      c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
    ) %>%
    as.data.frame() %>%
    rownames_to_column("sample") %>%
    as_tibble()
}
rle_1 <- gcInfo_Dr(dgeList_1)
rle_2 <- gcInfo_Dr(dgeList_2)
rle_3 <- gcInfo_Dr(dgeList_3)
rle_4 <- gcInfo_Mm(dgeList_4)
rle_5 <- gcInfo_Mm(dgeList_5)
sum_1 <- gcSummary(rle_1)
sum_2 <- gcSummary(rle_2)
sum_3 <- gcSummary(rle_3)
sum_4 <- gcSummary(rle_4)
sum_5 <- gcSummary(rle_5)
a_1 <- sum_1 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b_1 <- sum_1 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  ) 
ggarrange(a_1, b_1, ncol = 2, nrow = 1) %>%
  annotate_figure("Dataset 1, E-GEOD-71609 (D. rerio)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

a_2 <- sum_2 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b_2 <- sum_2 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  )
ggarrange(a_2, b_2, ncol = 2, nrow = 1) %>%
  annotate_figure("Dataset 2, E-GEOD-72322 (D. rerio)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

a_3 <- sum_3 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b_3 <- sum_3 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  )
ggarrange(a_3, b_3, ncol = 2, nrow = 1) %>%
  annotate_figure("Dataset 3, PsenS4Ter (D. rerio)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

a_4 <- sum_4 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b_4 <- sum_4 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  )
ggarrange(a_4, b_4, ncol = 2, nrow = 1) %>%
  annotate_figure("Dataset 4, E-MTAB-7636 (M. musculus)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

a_5 <- sum_5 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_logLen)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean log(Length)"
  ) 
b_5 <- sum_5 %>%
  left_join(rRnaProp) %>%
  ggplot(aes(proportion, mn_gc)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = percent) +
  labs(
    x = "rRNA Proportion of Initial Library",
    y = "Mean GC Content"
  )
ggarrange(a_5, b_5, ncol = 2, nrow = 1) %>%
  annotate_figure("Dataset 5, E-MTAB-6972 (M. musculus)")
*Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.*

Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.

PCA

genes2keep_3 <- dgeList_3 %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(6)
dgeFilt_3 <- dgeList_3[genes2keep_3,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()
pca_3 <- cpm(dgeFilt_3, log = TRUE) %>%
  t() %>%
  prcomp()
pcaCor_3 <- pca_3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sum_3) %>%
  as_tibble() %>% 
  left_join(rRnaProp) %>%
  dplyr::select(
    PC1, PC2, PC3, 
    Mean_GC = mn_gc, 
    Mean_Length = mn_logLen, 
    rRna_Proportion = proportion
  ) %>% 
  cor()

Dataset 3 (D. rerio)

a_3 <- pca_3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  as_tibble() %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(size = 2) +
  labs(
    x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
    y = paste0("PC2 (", percent(summary(pca_3)$importance["Proportion of Variance","PC2"]),")")
  )
b_3 <- pca_3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(rRnaProp) %>%
  ggplot(aes(PC1, proportion)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
    y = "rRNA Proportion of Initial Library"
  )
c_3 <- pca_3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sum_3) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_logLen)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  labs(
    x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean log(Length)"
  )
d_3 <- pca_3$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(sum_3) %>%
  as_tibble() %>%
  ggplot(aes(PC1, mn_gc)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm") +
  scale_y_continuous(labels = percent) +
  labs(
    x = paste0("PC1 (", percent(summary(pca_3)$importance["Proportion of Variance","PC1"]),")"),
    y = "Mean GC"
  )
ggarrange(a_3, b_3, c_3, d_3, ncol = 2, nrow = 2) %>%
  annotate_figure("Psen2S4Ter")
*PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.*

PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.

corrplot(
  pcaCor_3,
  type = "lower", 
  diag = FALSE, 
  addCoef.col = 1, addCoefasPercent = TRUE
)
*Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.*

Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.

Differential abundance

design_3 <- model.matrix(~proportion, data = dgeFilt_3$samples)
voom_3 <- voom(dgeFilt_3, design = design_3)
fit_3 <- lmFit(voom_3, design = design_3)
eBayes_3 <- eBayes(fit_3)
topTable_3 <- eBayes_3 %>%
  topTable(coef = colnames(design_3)[2], sort.by = "p", n = Inf) %>%
  set_colnames(str_remove(colnames(.), "ID\\.")) %>%
  mutate(DE = adj.P.Val < 0.05) %>%
  unite(Location, c(seqnames, start, end, width, strand), sep = ":") %>%
  dplyr::select(
    Geneid = gene_id,
    Symbol = gene_name,
    AveExpr,
    logFC,
    P.Value,
    FDR = adj.P.Val,
    Location,
    t,
    DE,
    everything(),
    -B
  ) %>%
  as_tibble()

Dataset 3 (D. rerio)

topTable_3 %>% 
  dplyr::select(Geneid, Symbol, AveExpr, logFC, P.Value, FDR, DE) %>%
  mutate(
    AveExpr = format(round(AveExpr, 2), nsmall = 2),
    logFC = format(round(logFC, 2), nsmall = 2),
    P.Value = sprintf("%.2e", P.Value),
    FDR = sprintf("%.2e", FDR)
  ) %>%
  dplyr::slice(1:200) %>%
  datatable(options = list(pageLength = 20), class = "striped hover condensed responsive", filter = "top")

Transcript counts

dirs_3 <- list.dirs(
  "~/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant/", 
  recursive = FALSE
)
salmon_3 <- catchSalmon(paths = dirs_3)
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_80_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_81_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_84_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Heter_6month_07_07_2016_F3_89_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_78_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_82_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_83_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_Hom_6month_07_07_2016_F3_85_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_79_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_86_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_88_Fem, 51605 transcripts, 0 bootstraps
## Reading /Users/lachlanbaer/Documents/Bioinformatics/Projects/20170327_Psen2S4Ter_RNASeq/data/5_salmon/quant//Ps2Ex3M1_WT_6month_07_07_2016_F3_95_Fem, 51605 transcripts, 0 bootstraps
dgeListTx_3 <- salmon_3$counts %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "_6month_07_07_2016_F3")) %>%
  set_colnames(str_remove(colnames(.), "Ps2Ex3M1_")) %>%
  as.data.frame() %>%
  rownames_to_column("tx_id") %>%
  mutate(tx_id = str_remove(tx_id, "\\.[0-9]+$")) %>%
  column_to_rownames("tx_id") %>%
  DGEList() %>%
  calcNormFactors()
genes2keep_3 <- dgeListTx_3 %>%
  cpm() %>%
  is_greater_than(1) %>%
  rowSums() %>%
  is_weakly_greater_than(6)
dgeFiltTx_3 <- dgeListTx_3[genes2keep_3,, keep.lib.sizes = FALSE] %>%
  calcNormFactors()

k-mer analysis

Export fasta

# deIds <- topTable_3 %>%
#   dplyr::filter(DE) %>%
#   .$Geneid
# notIds <- topTable_3 %>%
#   dplyr::filter(!DE) %>%
#   .$Geneid
# dna <- ensembldb::getGenomeTwoBitFile(ensDb_Dr)
# deExons <- exonsBy(ensDb_Dr, by = "gene") %>%
#   .[deIds] %>%
#   lapply(function(x){
#     GenomicRanges::reduce(x)
#   }) %>%
#   GRangesList()
# deStrings <- lapply(deExons, function(x){
#   getSeq(dna, x) %>%
#     unlist()
# }
# )
# deStringSet <- DNAStringSet(deStrings)
# writeXStringSet(
#   deStringSet,
#   "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/deSeqs.fa"
# )
# conExons <- exonsBy(ensDb_Dr, by = "gene") %>%
#   .[conIds] %>%
#   lapply(function(x){
#     GenomicRanges::reduce(x)
#   }) %>%
#   GRangesList()
# conStrings <- lapply(conExons, function(x){
#   getSeq(dna, x) %>%
#     unlist()
# }
# )
# conStringSet <- DNAStringSet(conStrings)
# writeXStringSet(
#   conStringSet,
#   "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/conSeqs.fa"
# )

Load kmer counts

loadMer <- function(x){
  read.table(
    paste0(
      "/data/biohub/20170327_Psen2S4Ter_RNASeq/data/6_jellyfish/", 
      x, 
      "_dumps.txt"
    ),
    col.names = c("mer", "count")
  ) %>%
    dplyr::arrange(-count) %>%
    as_tibble()
}
merDek5 <- loadMer("dek5")
merDek6 <- loadMer("dek6")
merDek7 <- loadMer("dek7")
merDek8 <- loadMer("dek8")
merDek9 <- loadMer("dek9")
merDek10 <- loadMer("dek10")
merConk5 <- loadMer("conk5")
merConk6 <- loadMer("conk6")
merConk7 <- loadMer("conk7")
merConk8 <- loadMer("conk8")
merConk9 <- loadMer("conk9")
merConk10 <- loadMer("conk10")

Enrichment testing

x <- merDek10
y <- merConk10
merRatio <- function(x, y) {
  join <- x %>% 
    full_join(y, by = "mer")
  join[is.na(join)] <- 0
  join %>%
    mutate(
      p.x = count.x / sum(count.x), 
      p.y = count.y / sum(count.y), 
      ratio = p.x / p.y
    ) %>% 
    arrange(desc(ratio))
}
merRatk5 <- merRatio(merDek5, merConk5)
merRatk6 <- merRatio(merDek6, merConk6)
merRatk7 <- merRatio(merDek7, merConk7)
merRatk8 <- merRatio(merDek8, merConk8)
merRatk9 <- merRatio(merDek9, merConk9)
merRatk10 <- merRatio(merDek10, merConk10)
plotRatk5 <- merRatk5 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 5") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk6 <- merRatk6 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 6") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk7 <- merRatk7 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 7") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk8 <- merRatk8 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 8") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk9 <- merRatk9 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 9") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRatk10 <- merRatk10 %>%
  ggplot(aes(p.x, p.y)) +
  geom_point(size = 1) +
  labs(x = "DE", y = "Not DE", title = "k = 10") +
  scale_x_sqrt() +
  scale_y_sqrt() +
  geom_abline(slope = 1, intercept = 0, col = "blue")
plotRat <- ggarrange(plotRatk5, plotRatk6, plotRatk7, plotRatk8, 
                     plotRatk9, plotRatk10, nrow = 3, ncol = 2)
annotate_figure(
  plotRat, 
  bottom = "Square root ratio of DE gene kmer counts vs not DE gene kmer counts"
)

Bootstrapping

nBins <- list(length = 10, gc = 10)
binTable_3 <- topTable_3 %>%
  dplyr::select(-gene_biotype, entrezid) %>%
    mutate(
        lengthBins = cut(
            log(aveLen), 
            breaks = quantile(
                log(aveLen), seq(0, nBins$length)/nBins$length
            ),
            labels = paste0("L", seq_len(nBins$length)), 
            include.lowest = TRUE
        ),
        gcBins = cut(
            aveGc, 
            breaks = quantile(
                aveGc, seq(0, nBins$gc) / nBins$gc
            ),
            labels = paste0("GC", seq_len(nBins$gc)), 
            include.lowest = TRUE
        ),
        bothBins = paste(lengthBins, gcBins, sep = "_"),
        bothBins = as.factor(bothBins)
    ) %>%
    group_by(bothBins) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    dplyr::filter(n > 1) %>%
    as_tibble()